Structural analysis

Giacomo Mutti

27-May-2024

library(tidyverse)
library(patchwork)
library(ggtree)
library(cowplot)
library(glue)
source("workflow/scripts/functions.R")
theme_set(theme_classic())

models <- names(palettes_model)

params <- yaml::read_yaml("config/params.yaml")
params_dataset <- yaml::read_yaml("config/Hsapopi_test.yaml")

outdir <- glue(params$outdir, params_dataset$dataset)
seeds <- readLines(glue(outdir, "/ids/UP000005640_common.ids"))

tax <- read_delim(params_dataset$taxons_file, show_col_types=F)
taxmap <- read_delim(glue(outdir, "/db/taxidmap"), 
                     delim = "\t", col_names = c("label", "Tax_ID"),
                     show_col_types = FALSE) %>% 
  left_join(tax, by = "Tax_ID") %>% 
  dplyr::select(label, Proteome_ID, mnemo)

taxidmap <- read_delim(glue(outdir,"/db/taxidmap"), 
                       col_names = c("target", "Tax_ID"), show_col_types=F)

sptree <- read.tree(params_dataset$species_tree_labels)
translate <- deframe(distinct(dplyr::select(taxmap, Proteome_ID, mnemo)))
# sptree$tip.label <- names(translate)[match(translate, sptree$tip.label)]

species_dist_df <- tibble()
for (sp in sptree$tip.label){
  if (sp=="Hsap") {next}
  dist <- castor::get_all_distances_to_tip(sptree, "Hsap")[MRCA(sptree, "Hsap", sp)]
  mrca <- c(sptree$tip.label, sptree$node.label)[MRCA(sptree, "Hsap", sp)]
  species_dist_df <- bind_rows(species_dist_df, tibble(mnemo=sp, value=dist, mrca=mrca))
}

# read input table
lvls_tax <- rev(c(unique(tax$Clade), "archaea", "bacteria"))

table_columns <- c('Proteome_ID', 'Tax_ID', 'count1', 'count2', 'count3', 
                   'genome', 'source', 'species', 'mnemo')

table <- read_delim(params_dataset$input_table_file, show_col_types=F, 
                    delim = "\t", col_names = table_columns) %>% 
  left_join(tax, by = "Tax_ID") %>% 
  mutate(Clade = ifelse(is.na(Clade), mnemo, Clade),
         Clade = factor(Clade, levels = lvls_tax))

Homology

blast_self <- read_delim(glue(outdir, "/homology/allvall/",
                              params_dataset$seed, "_", params_dataset$seed, "_blast.tsv"),
                         show_col_types=FALSE,
                         col_names = blast_columns) %>% 
  filter(query %in% seeds) %>%
  filter(query==target) %>% 
  mutate(blast_selfbit=bitscore) %>% 
  select(query, blast_selfbit)

fs_self <- read_delim(glue(outdir, "/homology/allvall/",
                              params_dataset$seed, "_", params_dataset$seed, "_fs.tsv"), 
                      show_col_types = FALSE,
                         col_names = blast_columns) %>% 
  filter(query %in% seeds) %>%
  filter(query==target) %>% 
  mutate(fs_selfbit=bitscore) %>% 
  select(query, fs_selfbit)

self <- full_join(blast_self, fs_self, by="query")
fs <- read_delim(glue(outdir, "/homology/", params_dataset$seed, "_fs.tsv"), 
                 col_names = fs_columns, show_col_types = FALSE) %>% 
  filter(query %in% seeds) %>%
  mutate(evalue = ifelse(evalue==0, 1e-180, evalue)) %>%
  left_join(taxidmap, by = "target")

fs_brhs <- read_delim(glue(outdir, "/homology/", params_dataset$seed, "_fs_brh.tsv"),
                     col_names = c("query", "target"), show_col_types = FALSE) %>% 
  filter(query %in% seeds) %>%
  mutate(method="fs_brh") 
blast <- read_delim(glue(outdir, "/homology/", params_dataset$seed, "_blast.tsv"), 
                    col_names = blast_columns, show_col_types = FALSE) %>% 
  filter(query %in% seeds) %>%
  mutate(evalue = ifelse(evalue==0, 1e-180, evalue))

blast_brhs <- read_delim(glue(outdir, "/homology/", params_dataset$seed, "_blast_brh.tsv"),
                     col_names = c("query", "target"), show_col_types = FALSE) %>% 
  filter(query %in% seeds) %>%
  mutate(method="blast_brh")
df <- full_join(blast, fs,
                by = c("query", "target"), suffix=c("_blast", "_fs")) %>%
    filter(query!=target) %>% 
  left_join(self, by="query") %>%
  mutate(pident_fs = 100*pident_fs, 
         qcov_fs = 100*qcov_fs,
         singleton = case_when(is.na(evalue_fs) ~ "only_blast",
                               is.na(evalue_blast) ~ "only_fs", 
                               .default = "common"),
         blast_norm = bitscore_blast/blast_selfbit, fs_norm = bitscore_fs/fs_selfbit) 

brhs <- rbind(blast_brhs, fs_brhs) %>% 
    group_by(query, target, method) %>%
    mutate(n=1) %>%
    pivot_wider(names_from = method, values_fill = 0, values_from = n) %>%
    mutate(common = ifelse(fs_brh+blast_brh==2, 1, 0))

# grouped_brhs <- brhs %>%
#   group_by(query) %>%
#   summarise(fs_brh = sum(fs_brh), blast_brh=sum(blast_brh), common_brh=sum(common))

Blast vs Foldseek

There are these number of hits, divided by singleton class:

df %>% count(singleton)
## # A tibble: 3 × 2
##   singleton       n
##   <chr>       <int>
## 1 common     126086
## 2 only_blast 169528
## 3 only_fs    243979

We can explore different properties of blast and foldseek search divided by singleton class.

Blast vs foldseek properties
Blast vs foldseek properties

BRHs

We can explore the abundance of BRH hits for every seed protein across the different species.

brhs %>% 
  left_join(select(taxmap, -Proteome_ID), by=c("target"="label")) %>% 
  mutate(mnemo=factor(mnemo, levels=sptree$tip.label)) %>% 
  # pivot_longer(cols = c(blast_brh, fs_brh, common)) %>% 
  filter(mnemo!="Hsap") %>% 
  pivot_longer(cols = c(blast_brh, fs_brh, common)) %>% 
  filter(value!=0) %>% 
  ggplot(aes(mnemo, fill=name)) + 
  geom_bar(position = "dodge") + 
  scale_fill_manual(values = palette_brh) + 
  scale_y_continuous(expand = expansion(0,0))

# select(df, query, target, singleton) %>% 
#   left_join(brhs) %>% 
#   group_by(singleton) %>%
#   summarise(#prop_brh = sum(!is.na(common))/n()*100,
#             prop_blast_brh = sum(blast_brh, na.rm = T)/n()*100, 
#             prop_fs_brh = sum(fs_brh, na.rm = T)/n()*100, 
#             prop_common_brh = sum(common, na.rm = T)/n()*100) %>% 
#   pivot_longer(!singleton) %>% 
#   ggplot(aes(name, value, color=singleton)) + 
#   geom_point(size=3) + 
#   scale_color_manual(values=palette_singleton) + 
#   labs(x="Type of BRH", y="% of BRH")

Singletons structure quality

lddt_df <- read_delim(paste0(params$structure_dir, "/", tax$Proteome_ID, "/", tax$Proteome_ID, "_mean_plddt.tsv"),
           col_names = c("target", "mean_lddt"), show_col_types = FALSE)

df %>% 
  select(target, singleton) %>% 
  mutate(target=gsub("AF-|-F1", "", target)) %>% 
  left_join(lddt_df, by="target") %>% 
  ggplot(aes(mean_lddt, fill=singleton)) + 
  geom_density(alpha=.5) +
  scale_y_continuous(expand = expansion(0.01,0)) +
  scale_x_continuous(expand = expansion(0.01,0)) +
  scale_fill_manual(values = palette_singleton) 

We can see the meand plddt for singleton classes and common are generally better imputed proteins than the two singletons. Expectedly fs suffers more than blast.

We can also see more in detail some blast and foldseek features and how they differ.

Foldseek and blast properties divided by singleton class
Foldseek and blast properties divided by singleton class

Post filtering blast

aln_seeds <- paste0("AF-",readLines(glue(outdir, "/ids/",params_dataset$seed,"_aln.ids")), "-F1")
blast_filtered <- filter(blast, 
                         query %in% aln_seeds,
                         length/qlen*100>params$coverage, 
                         length/slen*100>params$coverage,
                         evalue<as.numeric(params$eval_both)) %>% 
  group_by(query) %>% 
  slice_head(n = params$max_seqs) %>% 
  ungroup()

fs_filtered <- filter(fs, 
                      query %in% aln_seeds,
                      qcov*100>params$coverage, 
                      tcov*100>params$coverage, 
                      evalue<as.numeric(params$eval_both)) %>% 
  group_by(query) %>% 
  slice_head(n = params$max_seqs) %>% 
  ungroup()

We filter blast and foldseek by 50 query and target coverage and by 1e-3 min evalue for our 1000. Only 721 will go in the phylogenetic pipeline as they have more than 4 common hits. This results in 39683 blast rows and 57611 fs rows.

Originations

blast_originations <- blast_filtered %>% 
  select(query, target) %>% 
  left_join(taxmap, by=c("target"="label")) %>% 
  group_by(query) %>% 
  filter(mnemo!="Hsap") %>% 
  left_join(species_dist_df, by="mnemo") %>% 
  mutate(mrca=factor(mrca, levels=unique(species_dist_df$mrca), ordered = TRUE)) %>% 
  summarise(mrca=max(mrca)) %>% 
  group_by(mrca) %>% 
  count() %>% 
  mutate(method="blast")

fs_originations <- fs_filtered %>% 
  select(query, target) %>% 
  left_join(taxmap, by=c("target"="label")) %>% 
  group_by(query) %>% 
  filter(mnemo!="Hsap") %>% 
  left_join(species_dist_df, by="mnemo") %>% 
  mutate(mrca=factor(mrca, levels=unique(species_dist_df$mrca), ordered = TRUE)) %>% 
  summarise(mrca=max(mrca)) %>% 
  group_by(mrca) %>% 
  count() %>% 
  mutate(method="fs")

originations <- rbind(blast_originations, fs_originations)

originations %>% 
  ggplot(aes(n, fct_rev(mrca), fill=method)) + 
  geom_bar(stat = "identity", position = "dodge") + 
  scale_fill_manual(values = palettes_method) + 
  scale_x_continuous(expand = expansion(0,0)) + 
  labs(y="MRCA")

We can see where is the “taxonomic mrca” (the most distant species) for every seed. Interestingly, foldssek is more than blast just in the two last nodes.

Duplications

duplications_mrca <- read_delim(glue(outdir, "/reco/", params_dataset$seed,"_mrca.tsv"),
                                show_col_types = FALSE) %>% 
  separate(id, c("gene", "target", "alphabet", "model")) %>% 
  select(-alphabet) %>% 
  group_by(target, model, mrca) %>% 
  summarise(n=sum(n)) %>% 
  filter(model %in% c("LG", "FTPY")) %>% 
  mutate(mrca=factor(mrca, levels=fortify(sptree) %>% arrange(y) %>% pull(label), ordered = TRUE))
## `summarise()` has grouped output by 'target', 'model'. You can override using
## the `.groups` argument.
duplications_mrca %>% 
  filter(target %in% c("blast", "fs")) %>%
  ggplot(aes(n, mrca, fill=target)) + 
  geom_bar(position = "dodge", stat="identity") +
  facet_grid(~model) + 
  scale_x_continuous(expand = expansion(0,0)) +
  scale_fill_manual(values=palettes_method) 

# fortify(sptree) %>% 
#   inner_join(duplications_mrca, by=c("label"="mrca")) %>% 
#   ggtree() + 
#   geom_point(aes(size=n)) + 
#   facet_grid(target~model)

We can also explore where the duplications called by RangerDTL are mapped in the species tree. For this we are only interested in LG and FTPY to reduce complexity.

Trees

df_trees <- read_delim(glue(outdir, "/trees/", params_dataset$seed, "_unrooted_trees.txt"),
                       show_col_types = FALSE,
                       col_names = c("gene", "target", "alphabet", "model", "tree")) %>% 
  mutate(model=factor(model, levels=models))

Reconciliation

all 6 model results
all 6 model results
Astral-PRO results
Astral-PRO results
Distance 2 seed by singleton class
Distance 2 seed by singleton class

RF distance + varr2t

Various Tree analysis
Various Tree analysis

Model fit

How many times is each model better?

reco <- read_delim(glue(outdir, "/reco/", params_dataset$seed, "_DTL.tsv"), show_col_types = FALSE) %>%
  mutate(model=factor(model, levels=models))

scores <- read_delim(glue(outdir, "/reco/", params_dataset$seed, "_scores.tsv"), show_col_types = FALSE) %>%
  left_join(reco) %>%
  mutate(model=factor(model, levels=models))
## Joining with `by = join_by(id, targets, alphabet, model)`
reco_prop_plot <- scores %>%
  group_by(id, targets) %>%
  mutate(norm_reco_score=(dups+losses)/n_tips) %>%
  slice_min(norm_reco_score, with_ties = TRUE) %>%
  mutate(best_score=1/n()) %>% 
  ggplot(aes(targets, best_score, fill=model)) +
  geom_bar(stat = "identity") +
  labs(subtitle = "How many times each model has the lowest D+L/tips?") +
  scale_fill_manual(values = palettes_model)

tcs_prop_plot <- scores %>%
  group_by(id, targets) %>%
  slice_max(score, with_ties = TRUE) %>%
  mutate(best_score=1/n()) %>% 
  ggplot(aes(targets, best_score, fill=model)) +
  geom_bar(stat = "identity") +
  labs(subtitle = "How many times each model has the highest TCS score?") +
  scale_fill_manual(values = palettes_model)

reco_prop_plot + tcs_prop_plot +  plot_layout(guides = 'collect')

Distance between target classes

We can see the average normalized topological distance between the different groups in LG and FTPY.

df_trees_dist <- filter(df_trees, model %in% c("LG", "FTPY"), target=="union")
ts_dist <- read.tree(text = df_trees_dist$tree)
names(ts_dist) <- paste(df_trees_dist$gene, df_trees_dist$target, df_trees_dist$alphabet, df_trees_dist$model, sep = "_")

df_distance <- tibble()

for (idx in 1:length(ts_dist)) {
    seed <- paste0("AF-",df_trees_dist$gene[idx],"-F1")
    tmp_df <- select(filter(df, query==seed), target, singleton)

    tree_dist <- as.matrix(adephylo::distTips(ts_dist[[idx]], method = "nNodes")) %>% 
      as.data.frame() %>% 
      rownames_to_column() %>% 
      pivot_longer(!rowname) %>% 
      filter(rowname>name) %>%
      mutate(value=value/max(value)) %>% 
      left_join(tmp_df, by = c("rowname"="target")) %>% 
      left_join(tmp_df, by = c("name"="target")) %>% 
      mutate(a=pmin(singleton.x, singleton.y),
             b=pmax(singleton.x, singleton.y)) %>% 
      group_by(a, b) %>%
      filter(!is.na(a), !is.na(b)) %>%
      summarise(mean_dist=mean(value), n=n(), .groups = "keep") %>%
      mutate(gene=seed, model=df_trees_dist$model[idx])

  df_distance <- bind_rows(df_distance, tree_dist)
}

df_distance %>% 
  group_by(a,b,model) %>% 
  summarise(wmean=weighted.mean(mean_dist,n)) %>% 
  ggplot(aes(a,b, fill=wmean)) + 
  geom_tile() +
  geom_text(aes(label=round(wmean,2))) +
  facet_grid(~model) +
  scale_fill_gradientn(colors=wespal) +
  coord_cartesian(expand = F) +
  theme(axis.title = element_blank(),
        legend.position = "none")
## `summarise()` has grouped output by 'a', 'b'. You can override using the
## `.groups` argument.

CATH/SCOPe

Runtimes

Various runtimes statistics
Various runtimes statistics

TODOs

  • CATH/SCOPe
  • Benchmark: fix runtimes!
  • Fix FTPY branch support
  • Foldtree smk vs Foldtree python: DONE
  • duplication in species tree: DONE